library(gdata)
library(ggplot2)
library(dplyr)
library(tidyr)
library(corrplot)
library(kableExtra)
library(flextable)
library(tidyverse)
library(stringr)
library(lubridate)
library(scales)
library(graphics)
library(caret)
library(psych)
library(ggcorrplot)
library(PerformanceAnalytics)Multiple Linear Regression
Red Wine Quality Prediction
1 Setting up the environment
2 Importing Red Wine Quality Dataset
The goal is to model red wine quality based on physicochemical characteristics.
redDF <- read.csv("../data/winequality-red.csv",sep=",")
head(redDF) %>% kbl()| fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25 | 67 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15 | 54 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17 | 60 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13 | 40 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
3 Preparing for Machine Learning
Let’s perform the exploratory data analysis and prepare the data for machine learning. Use scatter plots, histograms, correlation values, and p-value. You can use the pairs.panels function in psych library and ggcorrplot to get pairwise correlations among variables, but I will use the PerformanceAnalytics package.
3.1 Using Standard Psych Package
library(psych)
pairs.panels(redDF)3.2 Using GGalley Package
library(GGally)
ggpairs(redDF, columns = c("fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar", "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density", "pH", "sulphates", "alcohol"),
aes(color = as.factor(quality), alpha = 0.5),
upper = list(continuous = wrap("cor", size = 2.5)))+ scale_colour_brewer(palette = "Set3",type = 'div',direction = 1)3.3 Correlation Matrix with GGcorrplot Package
ggcorrplot(redDF %>% cor(),
hc.order = TRUE, type = "lower",
lab = TRUE,
digits = 2,
ggtheme = ggplot2::theme_light(),
)3.4 Correlation Matrix with base Package
chart.Correlation(redDF, hist = T)3.5 Target variable distribution
summary(redDF$quality)#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 3.000 5.000 6.000 5.636 6.000 8.000
feqtab <- as_flextable(table(redDF$quality))
feqtabVar1 | Count | Percent |
|---|---|---|
3 | 10 | 0.6% |
4 | 53 | 3.3% |
5 | 681 | 42.6% |
6 | 638 | 39.9% |
7 | 199 | 12.4% |
8 | 18 | 1.1% |
Total | 1,599 | 100.0% |
The dependent variable seems ordinal; hence, linear regression is unsuitable for this problem.
3.5.1 Correlation Values
Strong correlation values >0.60 between some variables. We’ll verify this again in the assumptions section.
3.6 Data Preparation
3.6.1 Checking for Missing Values
redDF %>% is.na() %>% colSums()#> fixed.acidity volatile.acidity citric.acid
#> 0 0 0
#> residual.sugar chlorides free.sulfur.dioxide
#> 0 0 0
#> total.sulfur.dioxide density pH
#> 0 0 0
#> sulphates alcohol quality
#> 0 0 0
I am using the naniar package to check for missing values and plot them.
# Load the required libraries
library(naniar)
# Create a missing value plot
vis_miss(redDF)3.6.2 Checking for Outliers
Let’s perform an outlier analysis using the boxplot function. This will output the outlier numbers and will provide the boxplot for each variable.
# Specify the variable for which you want to find outliers
variable_of_interest <- redDF$fixed.acidity
# Calculate the boxplot statistics
boxplot_stats <- boxplot.stats(variable_of_interest)
# Get the indexes of outliers
outlier_indexes <- which(variable_of_interest %in% boxplot_stats$out)Here are the outlier indexes: 206, 207, 244, 245, 265, 295, 329, 339, 340, 348, 354, 360, 364, 365, 367, 375, 382, 392, 395, 410, 430, 441, 443, 447, 471, 473, 510, 511, 517, 539, 545, 549, 555, 556, 558, 560, 561, 565, 566, 597, 600, 602, 604, 612, 653, 681, 812, 815, 1225
# Load the required libraries
library(reshape2)
melted_data <- reshape2::melt(redDF, id.vars = 'quality')
# Create a grouped box plot
ggplot(melted_data, aes(x = variable, y = value)) +
geom_boxplot(aes(fill = as.factor(quality))) +
labs(title = "Grouped Box Plot of Variables vs. Quality") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("Variable")3.6.3 Train/test Split
set.seed(1)
sampleSize <- round(nrow(redDF)*0.8)
idx <- sample(seq_len(sampleSize), size = sampleSize)
X.train_red <- redDF[idx,]
X.test_red <- redDF[-idx,]
# X.test_red.y <- X.test_red
# X.test_red <- subset(X.test_red, select = -c(quality) )
rownames(X.train_red) <- NULL
rownames(X.test_red) <- NULL4 Modeling
4.1 Building the Initial Model
We won’t consider ‘residual sugar’ for our analysis.
model_red1 <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + chlorides + free.sulfur.dioxide +
total.sulfur.dioxide + density + pH + sulphates + alcohol,
data = X.train_red)
stargazer::stargazer(model_red1, type = "html", out = "regression.html" ,title = "Simple Linear Model", ci=TRUE, single.row = FALSE, no.space = FALSE, align = TRUE, digits=3, font.size = "small", report = "vc*stp")| Dependent variable: | |
| quality | |
| fixed.acidity | 0.015 |
| (-0.037, 0.068) | |
| t = 0.574 | |
| p = 0.567 | |
| volatile.acidity | -1.057*** |
| (-1.323, -0.790) | |
| t = -7.780 | |
| p = 0.000 | |
| citric.acid | -0.177 |
| (-0.502, 0.147) | |
| t = -1.070 | |
| p = 0.285 | |
| chlorides | -1.779*** |
| (-2.686, -0.872) | |
| t = -3.845 | |
| p = 0.0002 | |
| free.sulfur.dioxide | 0.003 |
| (-0.001, 0.008) | |
| t = 1.368 | |
| p = 0.172 | |
| total.sulfur.dioxide | -0.004*** |
| (-0.005, -0.002) | |
| t = -4.444 | |
| p = 0.00001 | |
| density | -11.025 |
| (-49.319, 27.269) | |
| t = -0.564 | |
| p = 0.573 | |
| pH | -0.383* |
| (-0.777, 0.010) | |
| t = -1.908 | |
| p = 0.057 | |
| sulphates | 0.794*** |
| (0.556, 1.033) | |
| t = 6.527 | |
| p = 0.000 | |
| alcohol | 0.292*** |
| (0.244, 0.341) | |
| t = 11.878 | |
| p = 0.000 | |
| Constant | 15.096 |
| (-22.377, 52.569) | |
| t = 0.790 | |
| p = 0.430 | |
| Observations | 1,279 |
| R2 | 0.369 |
| Adjusted R2 | 0.364 |
| Residual Std. Error | 0.648 (df = 1268) |
| F Statistic | 74.295*** (df = 10; 1268) |
| Note: | p<0.1; p<0.05; p<0.01 |
4.2 Model Assumptions
4.2.1 Checking Normality of Residuals
CheckNormal <- function(model) {
hist(model$residuals, breaks = 30)
shaptest <- shapiro.test(model$residuals)
print(shaptest)
if (shaptest$p.value <= 0.05) {
print("H0 rejected: the residuals are NOT distributed normally")
} else {
print("H0 failed to reject: the residuals ARE distributed normally")
}
}
CheckNormal(model = model_red1)#>
#> Shapiro-Wilk normality test
#>
#> data: model$residuals
#> W = 0.9897, p-value = 7.95e-08
#>
#> [1] "H0 rejected: the residuals are NOT distributed normally"
4.2.2 Checking Homoscedasticity
library(lmtest)
CheckHomos <- function(model){
plot(model$fitted.values, model$residuals)
abline(h = 0, col = "red")
BP <- bptest(model)
print(BP)
if (BP$p.value <= 0.05) {
print("H0 rejected: Error variance spreads INCONSTANTLY/generating patterns (Heteroscedasticity)")
} else {
print("H0 failed to reject: Error variance spreads CONSTANTLY (Homoscedasticity)")
}
}
CheckHomos(model = model_red1)#>
#> studentized Breusch-Pagan test
#>
#> data: model
#> BP = 56.615, df = 10, p-value = 1.575e-08
#>
#> [1] "H0 rejected: Error variance spreads INCONSTANTLY/generating patterns (Heteroscedasticity)"
4.2.3 Checking Multicollinearity
library(car)
vif(model_red1) %>% kbl()| x | |
|---|---|
| fixed.acidity | 6.935469 |
| volatile.acidity | 1.779712 |
| citric.acid | 3.229450 |
| chlorides | 1.497186 |
| free.sulfur.dioxide | 1.996028 |
| total.sulfur.dioxide | 2.312040 |
| density | 4.168414 |
| pH | 2.962262 |
| sulphates | 1.368089 |
| alcohol | 2.218354 |
4.3 Model Improvements
4.3.1 Outlier Diagnosis
par(mfrow=c(2,2))
lapply(c(1,2,4,5),
function(x) plot(model_red1,
which = x,
cook.levels = c(0.05, 0.1))) %>% invisible()4.3.2 Removing Influential Observations and Model Rerun
to.rm <- c(78,202,245,274,1161)
X.train_red <- X.train_red[-to.rm,]
rownames(X.train_red) <- NULL
model_red2 <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + chlorides +
free.sulfur.dioxide + total.sulfur.dioxide + density + pH + sulphates + alcohol,
data = X.train_red)
stargazer::stargazer(model_red1,model_red2, type = "html" ,title = "Regression Models for Red Wine Dataset", ci=TRUE, single.row = FALSE, no.space = FALSE, align = TRUE, digits=5, font.size = "small", report = "vc*stp")| Dependent variable: | ||
| quality | ||
| (1) | (2) | |
| fixed.acidity | 0.01547 | 0.02520 |
| (-0.03738, 0.06832) | (-0.02735, 0.07775) | |
| t = 0.57368 | t = 0.93974 | |
| p = 0.56629 | p = 0.34753 | |
| volatile.acidity | -1.05662*** | -1.03840*** |
| (-1.32279, -0.79045) | (-1.29974, -0.77706) | |
| t = -7.78041 | t = -7.78777 | |
| p = 0.00000 | p = 0.00000 | |
| citric.acid | -0.17735 | -0.20137 |
| (-0.50207, 0.14737) | (-0.52188, 0.11914) | |
| t = -1.07045 | t = -1.23139 | |
| p = 0.28463 | p = 0.21841 | |
| chlorides | -1.77905*** | -1.49611*** |
| (-2.68588, -0.87222) | (-2.40605, -0.58617) | |
| t = -3.84511 | t = -3.22253 | |
| p = 0.00013 | p = 0.00131 | |
| free.sulfur.dioxide | 0.00339 | 0.00391 |
| (-0.00147, 0.00825) | (-0.00088, 0.00870) | |
| t = 1.36756 | t = 1.59961 | |
| p = 0.17170 | p = 0.10994 | |
| total.sulfur.dioxide | -0.00364*** | -0.00355*** |
| (-0.00525, -0.00204) | (-0.00514, -0.00196) | |
| t = -4.44442 | t = -4.38488 | |
| p = 0.00001 | p = 0.00002 | |
| density | -11.02496 | -14.88229 |
| (-49.31898, 27.26907) | (-52.51200, 22.74741) | |
| t = -0.56428 | t = -0.77515 | |
| p = 0.57267 | p = 0.43840 | |
| pH | -0.38348* | -0.38371* |
| (-0.77738, 0.01041) | (-0.77262, 0.00521) | |
| t = -1.90814 | t = -1.93370 | |
| p = 0.05660 | p = 0.05338 | |
| sulphates | 0.79450*** | 0.89072*** |
| (0.55593, 1.03307) | (0.64894, 1.13249) | |
| t = 6.52710 | t = 7.22053 | |
| p = 0.00000 | p = 0.00000 | |
| alcohol | 0.29243*** | 0.29981*** |
| (0.24418, 0.34069) | (0.25229, 0.34733) | |
| t = 11.87815 | t = 12.36589 | |
| p = 0.00000 | p = 0.00000 | |
| Constant | 15.09573 | 18.68649 |
| (-22.37738, 52.56883) | (-18.12650, 55.49948) | |
| t = 0.78956 | t = 0.99489 | |
| p = 0.42994 | p = 0.31999 | |
| Observations | 1,279 | 1,274 |
| R2 | 0.36945 | 0.38754 |
| Adjusted R2 | 0.36448 | 0.38269 |
| Residual Std. Error | 0.64763 (df = 1268) | 0.63437 (df = 1263) |
| F Statistic | 74.29508*** (df = 10; 1268) | 79.91719*** (df = 10; 1263) |
| Note: | p<0.1; p<0.05; p<0.01 | |
4.3.3 Model Comparison
ad.r.sq1 <- summary(model_red1)$adj.r.squared
ad.r.sq2 <- summary(model_red2)$adj.r.squaredAdjusted R-squared for 1st and the second models are 0.3644797 and 0.3826897, respectively. The difference between both is 1.821%.
5 Feature Selection
5.1 Use stepwise algorithm: Forward, Backward and Step-wise
First, let’s define the start and stop thresholds for the algorithm. We will also use the regsubset function from the leaps package to perform the stepwise regression.
model_redAlc <- lm(quality ~ alcohol, data = X.train_red)
model_redAll <- lm(quality ~ ., data = X.train_red)5.1.1 Backward Approach
5.1.1.1 Variable Selection Sequence
The variable selection dataframe below will show which variables were forced in and/or forced out for the backward approach.
OLS.regback <- leaps::regsubsets(quality ~ ., X.train_red,
method = "backward", nvmax = 11)
OLSregbacksum <- summary(OLS.regback)
Varselect <- data.frame(Forcein = OLSregbacksum$obj$force.in,
Forceout = OLSregbacksum$obj$force.out)
Varselect %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F,
position = "center", font_size = 9)| Forcein | Forceout | |
|---|---|---|
| TRUE | FALSE | |
| fixed.acidity | FALSE | FALSE |
| volatile.acidity | FALSE | FALSE |
| citric.acid | FALSE | FALSE |
| residual.sugar | FALSE | FALSE |
| chlorides | FALSE | FALSE |
| free.sulfur.dioxide | FALSE | FALSE |
| total.sulfur.dioxide | FALSE | FALSE |
| density | FALSE | FALSE |
| pH | FALSE | FALSE |
| sulphates | FALSE | FALSE |
| alcohol | FALSE | FALSE |
5.1.1.2 Model Output Matrix
We also need to see the variables included in each model. The table below will show that model 1 only includes alcohol while model 2 includes volatile.acidity and alcohol.
exhselect <- data.frame(OLSregbacksum$outmat)
exhselect %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F,
position = "center", font_size = 9)| fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 ( 1 ) | * | ||||||||||
| 2 ( 1 ) | * | * | |||||||||
| 3 ( 1 ) | * | * | * | ||||||||
| 4 ( 1 ) | * | * | * | * | |||||||
| 5 ( 1 ) | * | * | * | * | * | ||||||
| 6 ( 1 ) | * | * | * | * | * | * | |||||
| 7 ( 1 ) | * | * | * | * | * | * | * | ||||
| 8 ( 1 ) | * | * | * | * | * | * | * | * | |||
| 9 ( 1 ) | * | * | * | * | * | * | * | * | * | ||
| 10 ( 1 ) | * | * | * | * | * | * | * | * | * | * | |
| 11 ( 1 ) | * | * | * | * | * | * | * | * | * | * | * |
Now, let’s look at the detailed model measurement metrics, including The R^2, Adjusted-R^2, Mellow’s Cp, AIC, and BIC. These can be directly extracted from the OLS regression summary we have created in the first step.
5.1.1.3 R^2
rsqmat <- data.frame(t(OLSregbacksum$rsq))
rsqmat %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F,
position = "center", font_size = 9)| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.2596942 | 0.3406592 | 0.3608397 | 0.3743383 | 0.3799897 | 0.3847493 | 0.3865018 | 0.3866865 | 0.3871627 | 0.3876856 | 0.3884823 |
5.1.1.4 Adjusted R^2
adjr2mat <- data.frame(t(OLSregbacksum$adjr2))
adjr2mat %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F,
position = "center", font_size = 9)| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.2591122 | 0.3396217 | 0.3593299 | 0.3723662 | 0.3775448 | 0.3818357 | 0.3831096 | 0.3828078 | 0.3827992 | 0.3828375 | 0.3831521 |
5.1.1.5 Mellow’s Cp
cpmat <- data.frame(t(OLSregbacksum$cp))
cpmat %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F,
position = "center", font_size = 9)| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 |
|---|---|---|---|---|---|---|---|---|---|---|
| 257.7822 | 92.69342 | 53.04641 | 27.18921 | 17.52635 | 9.703904 | 8.087153 | 9.706003 | 10.7232 | 11.64424 | 12 |
5.1.1.6 Bayesian Information Criterion (BIC)
bicmat <- data.frame(t(OLSregbacksum$bic))
bicmat %>%
kbl() %>%
kable_styling(bootstrap_options = "striped", full_width = F,
position = "center", font_size = 9)| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 |
|---|---|---|---|---|---|---|---|---|---|---|
| -368.7818 | -509.19 | -541.6428 | -561.6869 | -566.0969 | -568.7647 | -565.249 | -558.4827 | -552.3224 | -546.2598 | -540.7687 |
5.1.1.7 Combining the Model Output Matrices
# Combine the four data frames into one
combined_df <- data.frame(
R2 = unlist(rsqmat),
Adj_R2 = unlist(adjr2mat),
Mellow_Cp = unlist(cpmat),
BIC = unlist(bicmat)
)
# Create a kable table with all the values
kable(combined_df, format = "html", col.names = c("R-sq.", "Adj R-sq.", "Mellow's Cp", "BIC")) %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center", font_size = 9)| R-sq. | Adj R-sq. | Mellow's Cp | BIC | |
|---|---|---|---|---|
| X1 | 0.2596942 | 0.2591122 | 257.782198 | -368.7818 |
| X2 | 0.3406592 | 0.3396217 | 92.693417 | -509.1900 |
| X3 | 0.3608397 | 0.3593299 | 53.046411 | -541.6428 |
| X4 | 0.3743383 | 0.3723662 | 27.189213 | -561.6869 |
| X5 | 0.3799897 | 0.3775448 | 17.526348 | -566.0969 |
| X6 | 0.3847493 | 0.3818357 | 9.703904 | -568.7647 |
| X7 | 0.3865018 | 0.3831096 | 8.087153 | -565.2490 |
| X8 | 0.3866865 | 0.3828078 | 9.706003 | -558.4827 |
| X9 | 0.3871627 | 0.3827992 | 10.723197 | -552.3224 |
| X10 | 0.3876856 | 0.3828375 | 11.644237 | -546.2598 |
| X11 | 0.3884823 | 0.3831521 | 12.000000 | -540.7687 |
5.1.1.8 Diagnostic plots for the Backward Selection
par(mfrow = c(2, 2))
plot(OLSregbacksum$rss, xlab = "Number of Variables\n(a)", ylab = "RSS",
type = "l", lwd = 1.5, cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
font.axis = 2, font.lab = 2, main = "Number of Variables Vs. RSS",
panel.first = grid(nx = NULL, ny = NULL, col = "gray", lty = 2))
plot(OLSregbacksum$adjr2, xlab = "Number of Variables\n(b)",
ylab = "Adjusted RSq", type = "l", lwd = 1.5, main = "Number of Variables Vs. Adjusted R2",
cex.main = 1.15, cex.lab = 1, cex.axis = 1.05, font.axis = 2,
font.lab = 2, panel.first = grid(nx = NULL, ny = NULL, col = "gray",
lty = 2))
points(6, OLSregbacksum$adjr2[6], col = "#336699", cex = 2, pch = 20)
plot(OLSregbacksum$cp, xlab = "Number of Variables\n(c)", ylab = "Cp",
type = "l", lwd = 1.5, cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
font.axis = 2, font.lab = 2, main = "Number of Variables Vs. Mellow's Cp",
panel.first = grid(nx = NULL, ny = NULL, col = "gray", lty = 2))
points(6, OLSregbacksum$cp[6], col = "#336699", cex = 2, pch = 20)
plot(OLSregbacksum$bic, xlab = "Number of Variables\n(d)", ylab = "BIC",
type = "l", lwd = 1.5, cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
font.axis = 2, font.lab = 2, main = "Number of Variables Vs. BIC",
panel.first = grid(nx = NULL, ny = NULL, col = "gray", lty = 2))
points(6, OLSregbacksum$bic[6], col = "#336699", cex = 2, pch = 20)5.1.1.9 Best Model from Backward Selection
# Find the model with the lowest BIC value
best_model_index <- which.min(OLSregbacksum$bic)
# Get the best model
best_model <- coef(OLS.regback, id = best_model_index)
# Extract the coefficients from the best model and remove the intercept term from the coefficients
best_coefficients <- round(unname(best_model)[-1], 4)
# Get the variable names from your redDF dataframe
variable_names <- names(best_model)[-1]
# Extract the intercept coefficient separately
intercept_coefficient <- round(unname(best_model)[1], 4)
# Define the named_coefficients variable
named_coefficients <- setNames(best_coefficients, variable_names)
# Create the LaTeX equation
equation <- paste("y =", intercept_coefficient, "+", paste(named_coefficients, variable_names, sep = " * ", collapse = " + "), "")$y = 3.9765 + -0.9888 * volatile.acidity + -1.6884 * chlorides + -0.0029 * total.sulfur.dioxide + -0.392 * pH + 0.8807 * sulphates + 0.3083 * alcohol $
5.1.2 Forward Approach
Following the leaps::regsubsets function, we will use the forward approach by changing method = "backward" to method = "forward" to select the best model. Make sure to change all OLS.regback to OLS.regfwd and related variables.
Caution: Running all three subset selection methods in one file will result in longer run times/knit times.
5.1.3 Best Subset Approach
Following the leaps::regsubsets function, we will use the best approach by changing method = "backward" to method = "best" to select the best model. Make sure to change all OLS.regback to OLS.regbest and related variables.
Caution: Running all three subset selection methods in one file will result in longer run times/knit times.
6 Check model performance
6.1 Prepare performance indicator function
# library(MLmetrics)
# indicator <- function(model, y_pred, y_true) {
# adj.r.sq <- summary(model)$adj.r.squared
# mse <- MSE(y_pred, y_true)
# rmse <- RMSE(y_pred, y_true)
# mae <- MAE(y_pred, y_true)
# print(paste0("Adjusted R-squared: ", round(adj.r.sq, 4)))
# print(paste0("MSE: ", round(mse, 4)))
# print(paste0("RMSE: ", round(rmse, 4)))
# print(paste0("MAE: ", round(mae, 4)))
# }
#
# indicator(model = model.both_red, y_pred = model.both_red$fitted.values, y_true = X.train_red$quality)
best_model_back <- lm(quality~ volatile.acidity + chlorides+ total.sulfur.dioxide+ pH+ sulphates + alcohol, data=X.train_red)
stargazer::stargazer(best_model_back, type = "html",title = "Backward Selection Best Model", ci=TRUE, single.row = TRUE, no.space = FALSE, align = TRUE, digits=4, font.size = "small", report = "vc*stp")| Dependent variable: | |
| quality | |
| volatile.acidity | -0.9888*** (-1.2045, -0.7731) |
| t = -8.9846 | |
| p = 0.0000 | |
| chlorides | -1.6884*** (-2.5485, -0.8283) |
| t = -3.8474 | |
| p = 0.0002 | |
| total.sulfur.dioxide | -0.0029*** (-0.0040, -0.0019) |
| t = -5.4365 | |
| p = 0.000000 | |
| pH | -0.3920*** (-0.6374, -0.1466) |
| t = -3.1307 | |
| p = 0.0018 | |
| sulphates | 0.8807*** (0.6426, 1.1188) |
| t = 7.2496 | |
| p = 0.0000 | |
| alcohol | 0.3083*** (0.2733, 0.3433) |
| t = 17.2681 | |
| p = 0.0000 | |
| Constant | 3.9765*** (3.1272, 4.8257) |
| t = 9.1773 | |
| p = 0.0000 | |
| Observations | 1,274 |
| R2 | 0.3847 |
| Adjusted R2 | 0.3818 |
| Residual Std. Error | 0.6348 (df = 1267) |
| F Statistic | 132.0538*** (df = 6; 1267) |
| Note: | p<0.1; p<0.05; p<0.01 |
We compare these predictions from the training set to the test set.
# metrics <- function(y_pred, y_true){
# mse <- MSE(y_pred, y_true)
# rmse <- RMSE(y_pred, y_true)
# mae <- MAE(y_pred, y_true)
# corPredAct <- cor(y_pred, y_true)
# print(paste0("MSE: ", round(mse, 6)))
# print(paste0("RMSE: ", round(rmse, 6)))
# print(paste0("MAE: ", round(mae, 6)))
# print(paste0("Correlation: ", round(corPredAct, 6)))
# print(paste0("R^2 between y_pred & y_true: ", round(corPredAct^2, 6)))
# }
#
# metrics(y_pred = model.both_red$fitted.values, y_true = X.train_red$quality)
#
# redPredict.both <- predict(model.both_red, newdata = X.test_red)
# metrics(y_pred = redPredict.both, y_true = X.test_red$quality)
# Predict on the test data
predictions <- predict(best_model_back, newdata = X.test_red)6.1.1 Plot between Predicted vs. actual values in training set
# redFitted.both <- data.frame(qualityPred = model.both_red$fitted.values,
# qualityAct = X.train_red$quality)
# ggplot(redFitted.both, aes(x = qualityPred, y = qualityAct)) +
# geom_point(aes(color = as.factor(qualityAct)), show.legend = F) +
# geom_smooth(method = "lm", se = F) +
# labs(title = "Predicted vs Actual Values Using Train Dataset", x = "Predicted quality", y = "Actual quality")6.1.2 Plot between Predicted vs. actual values in test set
# redPredict.bothDF <- data.frame(qualityPred = redPredict.both, qualityAct = X.test_red$quality)
# ggplot(redPredict.bothDF, aes(x = qualityPred, y = qualityAct)) +
# geom_point(aes(color = as.factor(qualityAct)), show.legend = F) +
# geom_smooth(method = "lm", se = F) +
# labs(title = "Predicted vs Actual Values Using Test Dataset", x = "Predicted quality", y = "Actual quality")
# Add the predicted values to your test data (X.test_red)
X.test_red$predicted_quality <- predictions
# Load the necessary libraries
library(ggplot2)
# Create a scatterplot to visualize the predicted values vs. actual values
ggplot(X.test_red, aes(x = quality, y = predicted_quality)) +
geom_point() +
labs(title = "Actual vs. Predicted Quality",
x = "Actual Quality",
y = "Predicted Quality") +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
theme_minimal()7 Equations
7.1 Causal
\begin{align} ln(y) &= \beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2} \implies y= e^{\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2}}\\ \sqrt{y} &=\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2}\implies y = (\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2})^2== b_0^2+b_1^2+2*b1b2\\ \sqrt[3]{y} &=\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2}\\ cnt &= 161.807 +85.5765 *temp + 314.3430 *atemp - 275.1803 *hum+ 43.000* windspeed\\ causal &= 161.807 +85.5765 *temp + 314.3430 *atemp - 275.1803 *hum+ 43.000* windspeed\\ Registerd &= 161.807 +85.5765 *temp + 314.3430 *atemp - 275.1803 *hum+ 43.000* windspeed \end{align}
lm1 = lm(BC_cnt~.-causal-registered-instant-cnt, data=train)